Bike Share for All
Baywheels offers all residents of the Bay Area an affordable, accessible, and fun new transportation option. Low-income residents qualify for a discounted membership.
I am interested in the following points:
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import os
from pathlib import Path
import zipfile
import glob
import matplotlib
import datetime
from mpl_toolkits.basemap import Basemap
from matplotlib.gridspec import GridSpec
%matplotlib inline
Load in your dataset and describe its properties through the questions below. Try and motivate your exploration goals through this section.
# Download every monthly trip archive (January 2018 - July 2020) that is not
# already present in the working directory.
url = "https://s3.amazonaws.com/baywheels-data/"
start_year = 2018
end_year = 2021  # exclusive upper bound: range() stops at 2020
for year in range(start_year, end_year):
    for mon in range(1, 13):
        # The analysis only covers data up to July 2020.
        if year == 2020 and mon > 7:
            break
        # Archives carry the "fordgobike" name up to and including April 2019
        # and the "baywheels" name afterwards, so all of 2018 needs the old
        # name, not only January-April.
        # NOTE(review): naming taken from the public bucket listing - confirm.
        if year < 2019 or (year == 2019 and mon < 5):
            brand = "fordgobike"
        else:
            brand = "baywheels"
        # Zero-pad the month without turning the loop variable into a string.
        file_name = "{}{:02d}-{}-tripdata.csv.zip".format(year, mon, brand)
        my_file = Path("./" + file_name)
        if my_file.is_file():
            continue  # already downloaded
        cmd = "wget " + url + file_name
        print(cmd)
        os.system(cmd)
!pwd
# read a sample file:
# Both the archive and the member are opened in context managers so the file
# handles are released as soon as the CSV has been parsed.
with zipfile.ZipFile('./201803-fordgobike-tripdata.csv.zip') as zf:
    with zf.open('201803-fordgobike-tripdata.csv') as csv_file:
        data = pd.read_csv(csv_file)
data.sample(10)
data.dtypes
# number of missing values per column
data.isnull().sum(axis = 0)
# check if it is removed
data.sample(10)
In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
# Box plot of all numeric columns of the sample month for a first look at
# the value ranges.
fig_size = [10, 8]  # width, height
plt.rcParams["figure.figsize"] = fig_size
ax = data.boxplot()
plt.xticks(rotation='vertical')
for set_label, text in ((plt.xlabel, 'Variables'), (plt.ylabel, 'Values')):
    set_label(text, fontsize=20)
plt.title('A simple box plot to see the distributions', fontsize=20)
# Histogram of the ride durations that lie below the 99th percentile.
# NOTE: despite its name, perc_95 holds the 99th percentile; the name is kept
# because the following cells refer to it.
perc_95 = np.percentile(data.duration_sec, 99)
n, bins, patches = plt.hist(data.duration_sec[data.duration_sec < perc_95], 20,
                            color="Grey", density=1, alpha=.5);
# The plotted values are the ones *below* the threshold, so say so.
plt.title("Ride duration in sec below the 99th percentile\n ", fontsize=20)
plt.xlabel("duration in sec", fontsize=20)
plt.ylabel("Density", fontsize=20)
# Look at the longest ride among the rides above the percentile threshold.
above_threshold = data.duration_sec > perc_95
max_duration = data.duration_sec[above_threshold].max()
longest_rides = data[data.duration_sec == max_duration]
#print(data[data.duration_sec == max_duration ])
print()
hours = longest_rides.duration_sec.values[0] / 3600.
print("Duration in hours ----------------: " + str(hours))
# show the full record(s) of the longest ride
data[data.duration_sec == max_duration]
# plot the histogram again, one overlaid histogram per user type
for user_type, colour in (("Customer", "Blue"), ("Subscriber", "red")):
    selected = (data.user_type == user_type) & (data.duration_sec < perc_95)
    n, bins, patches = plt.hist(data.duration_sec[selected], 50,
                                label='data.user_type == "' + user_type + '"',
                                color=colour, density=1, alpha=.5);
# Only durations below the threshold are plotted, so the title says "below".
plt.title("Ride duration in sec below the 99th percentile\n ", fontsize=20)
plt.xlabel("duration in sec", fontsize=20)
plt.ylabel("Density", fontsize=15)
plt.legend(fontsize=20)
# distinct user types present in the sample
set(data.user_type)
Issues with the data tidiness
Merging the whole dataset
Now I will try to merge the whole dataset together to make life easy for myself and store it on the disk :
# Read every monthly archive, derive date / time / weekday columns from the
# trip timestamps and collect the frames for one big concatenation.
data_total = []
timestamp_format = "%Y-%m-%d %H:%M:%S"
for i in sorted(glob.glob("./*csv.zip")):
    print(i)
    # Archive names look like "./YYYYMM-<brand>-tripdata.csv.zip".  Both month
    # digits are read (i[6:8]); a single digit would misread months 10-12.
    file_year, file_month = int(i[2:6]), int(i[6:8])
    # The CSV inside the archive has the archive's name without ".zip".
    with zipfile.ZipFile(i) as zf:
        with zf.open(i[2:-4]) as csv_file:
            data = pd.read_csv(csv_file, dtype='unicode')
    print(' ', i, file_year, file_month)
    # Files from April 2020 onwards use the started_at / ended_at / *_lat /
    # *_lng column names and carry no duration column.
    new_schema = (file_year == 2020) and (file_month > 3)
    if new_schema:
        start_col, end_col = 'started_at', 'ended_at'
    else:
        start_col, end_col = 'start_time', 'end_time'
    # Do not need the milliseconds: keep "YYYY-MM-DD HH:MM:SS" only, then
    # parse each timestamp column once instead of once per derived column.
    started = pd.to_datetime(data[start_col].str[:19], format=timestamp_format)
    ended = pd.to_datetime(data[end_col].str[:19], format=timestamp_format)
    data['start_new_date'] = started.dt.date
    data['end_new_date'] = ended.dt.date
    data['start_new_time'] = started.dt.time
    data['end_new_time'] = ended.dt.time
    data['start_dow'] = started.dt.weekday  # 0 = Monday ... 6 = Sunday
    data['end_dow'] = ended.dt.weekday
    if new_schema:
        # Rebuild the duration and the station coordinates under the names
        # used by the older files.
        data['duration_sec'] = (ended - started).dt.total_seconds()
        data['end_station_latitude'] = data['end_lat']
        data['end_station_longitude'] = data['end_lng']
        data['start_station_longitude'] = data['start_lng']
        data['start_station_latitude'] = data['start_lat']
        data.drop(['start_lat', 'end_lat', 'started_at', 'ended_at', 'end_lng', 'start_lng'],
                  axis=1, inplace=True)
    else:
        data.drop(['start_time', 'end_time'], axis=1, inplace=True)
    data_total.append(data)
appended_data = pd.concat(data_total)
# Column types of the merged frame.
appended_data.dtypes
# Spot-check a few random rows to see whether the merge worked.
appended_data.sample(10)
# Persist the merged frame and load it back from disk right away.
pickle_path = "./data_final.pkl"
appended_data.to_pickle(pickle_path)
df = pd.read_pickle(pickle_path)
Checking the merged data. Now I check the merged data:
# Overview of the merged data: column types, dimensions and the number of
# missing values per column.
df.dtypes
df.shape
df.isna().sum()
I do not need the bike_id, ride_id, start_lng and end_lng columns:
# Drop the identifier / raw coordinate columns that are not analysed.
# errors='ignore' skips labels that are absent (e.g. columns that were already
# dropped per file before the merge) instead of raising a KeyError.
df.drop(columns=['bike_id', 'ride_id', 'end_lng', 'start_lng'],
        inplace=True, errors='ignore')
df.sample(10)
df.columns
I decide for the following variables for my analysis:
duration_sec: I can do some analysis showing when and where the duration is increasing. This is what brings in money for the company!
start and end day of week
start and end longitude and latitudes
start and end time and date
# lets select the abovementioned variables:
variables_to_keep = ['duration_sec', 'start_dow', 'end_dow', 'start_new_date', 'end_new_date',
                     'start_new_time', 'end_new_time',
                     'start_station_latitude', 'end_station_latitude', 'start_station_longitude',
                     'end_station_longitude']
# Take an explicit copy: df_final is modified afterwards, and editing a plain
# selection of df triggers pandas' SettingWithCopyWarning.
df_final = df[variables_to_keep].copy()
df_final.head()
df_final.tail()
# missing values per selected column
df_final.isnull().sum(axis = 0)
There are still some NaNs in end_station_longitude and end_station_latitude. I decide to drop these:
# drop the nans :
# Re-bind the name instead of dropna(inplace=True): df_final was selected out
# of df, and in-place edits of such a selection raise SettingWithCopyWarning.
df_final = df_final.dropna()
# check it out :
df_final.isnull().sum(axis = 0)
df_final.dtypes
# The CSVs were read as text (dtype='unicode'), so the numeric columns have
# to be converted before any arithmetic or plotting.
to_be_float = ["duration_sec", "end_dow", "start_dow", "start_station_latitude", "start_station_longitude",
               "end_station_latitude", "end_station_longitude"]
df_final[to_be_float] = df_final[to_be_float].astype(str).astype(float)
# check the result:
df_final.dtypes
df_final.describe()
# Box plot of every numeric column of the cleaned frame.
fig_size = [10, 8]  # width, height
plt.rcParams["figure.figsize"] = fig_size
ax = df_final.boxplot()
plt.xticks(rotation='vertical')
for set_label, text in ((plt.xlabel, 'Variables'), (plt.ylabel, 'Values')):
    set_label(text, fontsize=20)
plt.title('A simple box plot to see the distributions', fontsize=20)
As can be seen, the duration in seconds shows unusual values (outliers!). Let's see what these are:
# Rides whose duration lies above the 99th percentile, and how many there are.
duration_p99 = np.percentile(df_final.duration_sec, 99)
is_extreme = df_final.duration_sec > duration_p99
df_99p = df_final[is_extreme]
df_99p.start_new_time.count()
I decide to drop extreme values, i.e. durations greater than the 99th percentile:
# Keep only the rides shorter than the 99th percentile of the duration.
duration_p99 = np.percentile(df_final.duration_sec, 99)
df_final = df_final[df_final.duration_sec < duration_p99]
# Box plot of the remaining data.
fig_size = [10, 8]  # width, height
plt.rcParams["figure.figsize"] = fig_size
ax = df_final.boxplot()
plt.xticks(rotation='vertical')
for set_label, text in ((plt.xlabel, 'Variables'), (plt.ylabel, 'Values')):
    set_label(text, fontsize=20)
plt.title('A simple box plot to see the distributions', fontsize=20)
The question is: what are the negative values in duration_sec?
# Rides with a negative duration, and the size of that subset.
has_negative_duration = df_final.duration_sec < 0
df_minus_duration = df_final[has_negative_duration]
df_minus_duration.shape
I will drop these as well!
# Keep only the rides with a strictly positive duration.
has_positive_duration = df_final.duration_sec > 0
df_final = df_final[has_positive_duration]
# Box plot of the remaining data.
fig_size = [10, 8]  # width, height
plt.rcParams["figure.figsize"] = fig_size
ax = df_final.boxplot()
plt.xticks(rotation='vertical')
for set_label, text in ((plt.xlabel, 'Variables'), (plt.ylabel, 'Values')):
    set_label(text, fontsize=20)
plt.title('A simple box plot to see the distributions', fontsize=20);
Next steps & ideas
- The idea is to find patterns such as the diurnal cycle, monthly cycles, and trends such as the change after the COVID-19 outbreak.
- Or a relation to the longitude and latitude.
In this section, investigate relationships between pairs of variables in your data. Make sure the variables that you cover here have been introduced in some fashion in the previous section (univariate exploration).
# plot number of rides falling in day of the week :
# datetime.weekday() numbering: 0 = Monday ... 6 = Sunday
day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
# for start day :
df_start_dow = df_final.start_dow.value_counts()
# for end day:
df_end_dow = df_final.end_dow.value_counts()
# Build the frame from the two Series (not their bare .values) so that start
# and end counts are matched on the weekday itself; value_counts() orders by
# frequency, which need not be the same order for both columns.  A separate
# name is used so the merged frame `df` is not overwritten.
rides_per_dow = pd.DataFrame({'start': df_start_dow,
                              'end': df_end_dow}).sort_index()
# Labels are derived from the weekday numbers actually on the axis.
index = [day_names[int(day)] for day in rides_per_dow.index]
ax = rides_per_dow.plot.barh()
ax.set_yticklabels(index)
plt.title('Number of rides per day of the week from 201801 - 2020-07', fontsize=20)
plt.xlabel('Number of rides', fontsize=15)
There is a weekly cycle: during the weekend people rent less, which suggests that they mainly rent the bikes for going to work!
# Break the start and end clock times into hour / minute / second columns
# (start_hour, start_minute, start_second, end_hour, end_minute, end_second).
for prefix in ("start", "end"):
    clock_times = df_final[prefix + "_new_time"]
    for part in ("hour", "minute", "second"):
        df_final[prefix + "_" + part] = [getattr(t, part) for t in clock_times]
df_final.head()
# plot number of rides falling in hour of the day :
# for start time :
df_start_hod = df_final.start_hour.value_counts()
# Sort ascending and plot against the hour itself.  Plotting descending-sorted
# values against the implicit 0..23 positions would mirror the hour axis.
rides_by_hour = df_start_hod.sort_index()
ax = plt.plot(rides_by_hour.index, rides_by_hour.values, 'r--o')
plt.xticks(range(0, 24, 2))
plt.title('Number of rides per hour of the day from 201801 - 2020-07', fontsize=20)
plt.xlabel('time', fontsize=15)
plt.ylabel('Number of rides', fontsize=15)
This confirms my hypothesis that people use the bikes for going to work and coming back home! See the two rush-hour peaks, one in the morning and one in the late afternoon!
# Break the start and end dates into year / month columns
# (start_year, start_month, end_year, end_month).
for prefix in ("start", "end"):
    dates = df_final[prefix + "_new_date"]
    for part in ("year", "month"):
        df_final[prefix + "_" + part] = [getattr(d, part) for d in dates]
df_final.head()
# Number of rides per calendar month (all years pooled), by start date.
fig_size = [10, 8]  # width, height
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
df_start_moy = df_final.start_month.value_counts()
rides_by_month = df_start_moy.sort_index()
plt.plot(rides_by_month, 'r--o')
plt.title('Number of rides per month of the year from 201801 - 2020-07', fontsize=20)
plt.xlabel('month', fontsize=15)
plt.ylabel('Number of rides', fontsize=15)
# one tick per month, labelled with the abbreviated month name
index = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split()
ax.set_xticks(range(1, 13))
ax.set_xticklabels(index);
# Median ride duration for every month from January 2018 to July 2020.
index = []    # tick labels, "<year>_<month>"
medians = []  # median duration (sec) of the rides started in that month
for year in range(2018, 2021):
    for month in range(1, 13):
        # the data ends in July 2020
        if year == 2020 and month > 7:
            break
        in_month = (df_final.start_month == month) & (df_final.start_year == year)
        medians.append(df_final[in_month].duration_sec.median())
        index.append(str(year) + "_" + str(month))
# The number of points follows from the loop instead of a hard-coded 31.
val = np.array(medians, dtype=float)
mon = len(index)
fig, ax = plt.subplots()
plt.plot(range(mon), val, 'k--o');
ax.set_xticks(range(mon))
ax.set_xticklabels(index, rotation = 45);
plt.xlabel("date (per month)", fontsize=15)
plt.ylabel("Median of ride duration (sec)", fontsize=15)
plt.title("Median of ride duration per month (sec)", fontsize=20)
now time to dig more into details:
Create plots of three or more variables to investigate your data even further. Make sure that your investigations are justified, and follow from your work in the previous sections.
# plot number of rides falling in hour of the day, one line per year,
# restricted to January-July so that the three years are comparable.
for year, line_style in ((2018, 'b--o'), (2019, 'k--o'), (2020, 'r--o')):
    in_period = (df_final.start_year == year) & (df_final.start_month < 8)
    # Sort ascending and plot against the hour itself; descending values on
    # implicit 0..23 positions would mirror the hour axis.
    rides_by_hour = df_final[in_period].start_hour.value_counts().sort_index()
    plt.plot(rides_by_hour.index, rides_by_hour.values, line_style, label=str(year))
plt.xticks(range(0, 24, 2))
plt.title('Number of rides per hour of the day from 2018 - 2020 \n for the first 7 months', fontsize=20)
plt.xlabel('time', fontsize=15)
plt.ylabel('Number of rides', fontsize=15)
plt.legend(fontsize=20)
Interesting that the 2020 peaks are lower than those of 2019! People losing their jobs? Less tourism?
Start points' duration sec:
# Map of the start points in the three service regions, coloured by the ride
# duration in seconds.
N = df_final.shape[0]  # number of rides to plot; lower it (e.g. 500000) for a quicker draft
#http://qingkaikong.blogspot.com/2016/06/nice-python-basemap-background.html
my_dpi = 200
# (title, (llcrnrlon, llcrnrlat, urcrnrlon, urcrnrlat)) for each panel
regions = [
    ("Region 1", (-122.53, 37.68, -122.35, 37.86)),
    ("Region 2", (-122.35, 37.75, -122.17, 37.91)),
    ("Region 3", (-122.0, 37.25, -121.75, 37.44)),
]
fig = plt.figure(figsize=(24, 12))
gs = GridSpec(nrows=1, ncols=3)
gs.update(wspace=0.1, hspace=0.3)
# plt.get_cmap is used because matplotlib.cm.get_cmap is gone in newer Matplotlib.
cmap = plt.get_cmap("jet", 13)
rides = df_final.iloc[:N]
value = rides.duration_sec
for column, (region_title, (lon_min, lat_min, lon_max, lat_max)) in enumerate(regions):
    ax0 = fig.add_subplot(gs[0, column])
    # `bmap` rather than `map`, so the builtin is not shadowed.
    bmap = Basemap(llcrnrlon=lon_min, llcrnrlat=lat_min,
                   urcrnrlon=lon_max, urcrnrlat=lat_max, resolution='i')
    bmap.drawcountries(linewidth=0.5, linestyle='solid', color='black', antialiased=1, ax=None, zorder=1)
    bmap.drawcoastlines()
    bmap.shadedrelief()
    bmap.arcgisimage(service='World_Topo_Map', xpixels = 1500, verbose= True)
    # Project the coordinates through this panel's own map object instead of
    # reusing the points computed for the first panel.
    xpt, ypt = bmap(rides.start_station_longitude, rides.start_station_latitude)
    cs = plt.scatter(xpt, ypt, edgecolors='none', vmin=0, vmax=1000, cmap=cmap, s=20,
                     c=value, alpha=.3)
    parallels = np.arange(0., 81, .05)
    bmap.drawparallels(parallels, labels=[False, True, True, False])
    meridians = np.arange(10., 351., .05)
    bmap.drawmeridians(meridians, labels=[True, False, False, True])
    plt.title(region_title, fontsize=30)
    plt.tight_layout()
# Make room on the right for one colour bar shared by the three panels.
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(cs, cax=cbar_ax)
plt.savefig('Start_points_duration_sec.png', format='png');
# Start locations at 07:00 (top row, black) and at 16:00 (bottom row, red)
# for the three service regions.
#http://qingkaikong.blogspot.com/2016/06/nice-python-basemap-background.html
my_dpi = 200
# (name, (llcrnrlon, llcrnrlat, urcrnrlon, urcrnrlat)) for each column
regions = [
    ("Region 1", (-122.53, 37.68, -122.35, 37.86)),
    ("Region 2", (-122.35, 37.75, -122.17, 37.91)),
    ("Region 3", (-122.0, 37.25, -121.75, 37.44)),
]
# (start hour, marker colour, label used in the title) for each row;
# hour 7 is in the morning and hour 16 in the afternoon.
hours = [(7, 'black', "7 am"), (16, 'red', "4 pm")]
fig = plt.figure(figsize=(24, 24))
gs = GridSpec(nrows=2, ncols=3)
gs.update(wspace=0.1, hspace=0.1)
for row, (hour, colour, hour_label) in enumerate(hours):
    started_at_hour = df_final[df_final.start_hour == hour]
    lon = started_at_hour.start_station_longitude
    lat = started_at_hour.start_station_latitude
    for column, (region_name, (lon_min, lat_min, lon_max, lat_max)) in enumerate(regions):
        ax0 = fig.add_subplot(gs[row, column])
        # `bmap` rather than `map`, so the builtin is not shadowed.
        bmap = Basemap(llcrnrlon=lon_min, llcrnrlat=lat_min,
                       urcrnrlon=lon_max, urcrnrlat=lat_max, resolution='i')
        bmap.drawcountries(linewidth=0.5, linestyle='solid', color='black', antialiased=1, ax=None, zorder=1)
        bmap.drawcoastlines()
        bmap.shadedrelief()
        bmap.arcgisimage(service='ESRI_StreetMap_World_2D', xpixels = 1200, verbose= True)
        xpt, ypt = bmap(lon, lat)
        cs = plt.scatter(xpt, ypt, edgecolors='none', s=10,
                         c=colour, alpha=.1)
        # Every panel gets the same grid lines.
        parallels = np.arange(0., 81, .05)
        bmap.drawparallels(parallels, labels=[False, True, True, False])
        meridians = np.arange(10., 351., .05)
        bmap.drawmeridians(meridians, labels=[True, False, False, True])
        plt.title(region_name + " start at " + hour_label, fontsize=30)
        plt.tight_layout()
plt.savefig('Start_End_points_movements.png', format='png');
Interesting that there is a moving pattern, but it is most obvious in region 1, so I will zoom in on this region.
In the above figure, black points are the start locations at 7 am and red points are the start locations at 4 pm.
# Zoom into region 1: start locations at 07:00 (top, blue) and at 16:00
# (bottom, red).
my_dpi = 200
# `bmap` rather than `map`, so the builtin is not shadowed.
bmap = Basemap(llcrnrlon=-122.43, llcrnrlat=37.77,
               urcrnrlon=-122.38, urcrnrlat=37.8, resolution='f')
fig = plt.figure(figsize=(24, 12))
gs = GridSpec(nrows=2, ncols=1)
gs.update(wspace=0.1, hspace=0.3)
# (start hour, marker colour, label used in the title) for each row;
# hour 7 is in the morning and hour 16 in the afternoon.
hours = [(7, 'blue', "7 am"), (16, 'red', "4 pm")]
for row, (hour, colour, hour_label) in enumerate(hours):
    ax0 = fig.add_subplot(gs[row, 0])
    bmap.arcgisimage(service='ESRI_StreetMap_World_2D', xpixels = 1200, verbose= True)
    started_at_hour = df_final[df_final.start_hour == hour]
    lon = started_at_hour.start_station_longitude
    lat = started_at_hour.start_station_latitude
    xpt, ypt = bmap(lon, lat)
    cs = plt.scatter(xpt, ypt, edgecolors='none', s=10,
                     c=colour, alpha=.5)
    parallels = np.arange(0., 81, .01)
    bmap.drawparallels(parallels, labels=[False, True, True, False])
    meridians = np.arange(10., 351., .05)
    bmap.drawmeridians(meridians, labels=[True, False, False, True])
    plt.title("Region 1 start at " + hour_label + " zoom", fontsize=30)
    plt.tight_layout()
plt.savefig('Start_End_points_movements_zoom.png', format='png');
# Render one zoomed map of the start locations per hour of the day; the PNG
# files are named Start_points_at_<HH>_zoom.png.
# `bmap` rather than `map`, so the builtin is not shadowed.
bmap = Basemap(llcrnrlon=-122.43, llcrnrlat=37.77,
               urcrnrlon=-122.38, urcrnrlat=37.8, resolution='f')
for hour in range(0, 24):
    print('plotting ' + str(hour))
    fig = plt.figure(figsize=(12, 12))
    bmap.arcgisimage(service='ESRI_StreetMap_World_2D', xpixels = 1200, verbose= True)
    started_at_hour = df_final[df_final.start_hour == hour]
    lon = started_at_hour.start_station_longitude
    lat = started_at_hour.start_station_latitude
    xpt, ypt = bmap(lon, lat)
    cs = plt.scatter(xpt, ypt, edgecolors='none', s=10,
                     c='blue', alpha=.5)
    parallels = np.arange(0., 81, .01)
    bmap.drawparallels(parallels, labels=[False, True, True, False])
    meridians = np.arange(10., 351., .05)
    bmap.drawmeridians(meridians, labels=[True, False, False, True])
    # 24-hour clock label: a fixed "pm" suffix is wrong for the morning hours.
    plt.title("Region 1 start at {:02d}:00 zoom".format(hour), fontsize=30)
    plt.tight_layout()
    # Zero-padded hour keeps the files in chronological order when sorted.
    plt.savefig("Start_points_at_{:02d}_zoom.png".format(hour), format='png');
    # close the figure so 24 open figures do not pile up in memory
    plt.close()
import imageio
from IPython.display import HTML

# Collect the hourly frames in name order and write them into one animated
# GIF, then embed the GIF in the notebook output.
frame_files = sorted(glob.glob("./Start_points_at_*_zoom.png"))
with imageio.get_writer('./movie.gif', mode='I', duration=1) as writer:
    for filename in frame_files:
        writer.append_data(imageio.imread(filename))
HTML('<h1> The start locations with time <img src="./movie.gif">')
You see how people are using the bikes to commute to work and go back home
# Zoom into region 1 and compare the start points of April 2019 (top) with
# April 2020 (bottom), coloured by the ride duration in seconds.
#http://qingkaikong.blogspot.com/2016/06/nice-python-basemap-background.html
# `bmap` rather than `map`, so the builtin is not shadowed.
bmap = Basemap(llcrnrlon=-122.43, llcrnrlat=37.77,
               urcrnrlon=-122.38, urcrnrlat=37.8, resolution='f')
fig = plt.figure(figsize=(24, 12))
gs = GridSpec(nrows=2, ncols=1)
gs.update(wspace=0.1, hspace=0.2)
# plt.get_cmap is used because matplotlib.cm.get_cmap is gone in newer Matplotlib.
cmap = plt.get_cmap("jet", 13)
for row, year in enumerate((2019, 2020)):
    ax0 = fig.add_subplot(gs[row, 0])
    bmap.arcgisimage(service='ESRI_StreetMap_World_2D', xpixels = 1200, verbose= True)
    # select the rides of that April once and reuse the selection
    april_rides = df_final[(df_final.start_year == year) & (df_final.start_month == 4)]
    lon = april_rides.start_station_longitude
    lat = april_rides.start_station_latitude
    xpt, ypt = bmap(lon, lat)
    value = april_rides.duration_sec
    cs = plt.scatter(xpt, ypt, edgecolors='none', vmin=0, vmax=1000, cmap=cmap, s=20,
                     c=value, alpha=.4)
    parallels = np.arange(0., 81, .05)
    bmap.drawparallels(parallels, labels=[False, True, True, False])
    meridians = np.arange(10., 351., .05)
    bmap.drawmeridians(meridians, labels=[True, False, False, True])
    plt.title("Region 1 zoom April " + str(year), fontsize=30)
    plt.tight_layout()
# one colour bar for both panels (same vmin / vmax / cmap in each)
cbar_ax = fig.add_axes([0.7, 0.15, 0.03, 0.8])
fig.colorbar(cs, cax=cbar_ax)
plt.savefig('2020_04_vs_2019_04_zoom.png', format='png');
This is really important: it shows that there was a boom in bike sharing in 2020 during the pandemic!
I will plot the differences in bike rides on the map for each month from January to July between 2019 and 2020:
# Grid of maps: one row per month (January-July), 2019 in the left column and
# 2020 in the right column.
# NOTE: the name `map` shadows the builtin; it is kept because plot_month()
# reads it as a global.
map = Basemap(llcrnrlon=-122.43, llcrnrlat=37.77,
              urcrnrlon=-122.38, urcrnrlat=37.8, resolution='f')
fig = plt.figure(figsize=(12, 36))
gs = GridSpec(nrows=7, ncols=2)
gs.update(wspace=0.1, hspace=0.2)


def plot_month(year, month, column, row):
    """Draw the start points of one month into one cell of the figure grid.

    Uses the module-level ``fig``, ``gs``, ``map`` and ``df_final``.

    year, month -- select the rides by ``start_year`` / ``start_month``
    column, row -- position of the panel inside ``gs``
    """
    ax0 = fig.add_subplot(gs[row, column])
    map.arcgisimage(service='ESRI_StreetMap_World_2D', xpixels = 1200, verbose= True)
    # select the rides of the month once and reuse the selection
    rides = df_final[(df_final.start_year == year) & (df_final.start_month == month)]
    lon = rides.start_station_longitude
    lat = rides.start_station_latitude
    xpt, ypt = map(lon, lat)
    # low alpha: dense areas show up as darker red
    cs = plt.scatter(xpt, ypt, edgecolors='none', s=10,
                     c='red', alpha=.1)
    parallels = np.arange(0., 81, .05)
    map.drawparallels(parallels, labels=[False, True, True, False])
    meridians = np.arange(10., 351., .05)
    map.drawmeridians(meridians, labels=[True, False, False, True])
    plt.title(str(month) + " " + str(year), fontsize=20)


# now plot the matrix :
for row, month_number in enumerate(range(1, 8)):
    plot_month(2019, month_number, 0, row)
    plot_month(2020, month_number, 1, row)
plt.savefig('2020_vs_2019.png', format='png');
Now just look at the number of new daily COVID-19 cases in San Francisco:
%%html
<iframe src="https://app.powerbigov.us/view?r=eyJrIjoiYTkwY2Y5NzUtOWQ0MC00ZDljLTk0M2YtNTBkNGExYjkwYjY0IiwidCI6IjIyZDVjMmNmLWNlM2UtNDQzZC05YTdmLWRmY2MwMjMxZjczZiJ9&navContentPaneEnabled=false&filterPaneEnabled=false" width="1000" height="1000"></iframe>
# Number of rides for every month from January 2018 to July 2020.
index = []   # tick labels, "<year>_<month>"
counts = []  # number of rides started in that month
for year in range(2018, 2021):
    for month in range(1, 13):
        # the data ends in July 2020
        if year == 2020 and month > 7:
            break
        in_month = (df_final.start_month == month) & (df_final.start_year == year)
        counts.append(len(df_final[in_month].duration_sec))
        index.append(str(year) + "_" + str(month))
# The number of points follows from the loop instead of a hard-coded 31.
val = np.array(counts, dtype=float)
mon = len(index)
fig, ax = plt.subplots(figsize=(16, 8))
plt.plot(range(mon), val, 'k--o');
ax.set_xticks(range(mon))
ax.set_xticklabels(index, rotation = 45);
plt.xlabel("date (per month)", fontsize=15)
plt.ylabel("Rides", fontsize=15)
# a ride count has no unit, so the title carries no "(sec)"
plt.title("Number of rides per month", fontsize=20)
At the end of your report, make sure that you export the notebook as an html file from the
File > Download as... > HTMLmenu. Make sure you keep track of where the exported file goes, so you can put it in the same folder as this notebook for project submission. Also, make sure you remove all of the quote-formatted guide notes like this one before you finish your report!
# Number of rides for whatever values `month` and `year` currently hold.
# NOTE(review): these look like leftovers of the preceding monthly loop, which
# stops at year 2020 / month 8 - confirm this is the month that was intended.
len(df_final[(df_final.start_month==month) & (df_final.start_year==year)].duration_sec)
I imagine that tourists rent bikes for more than 1 hour (3600 seconds). So if there are fewer rides of 3600 seconds or more after the pandemic started, then I conclude there might be fewer tourists around. I will check that below:
# Number of rides lasting one hour (3600 sec) or more, for every month from
# January 2018 to July 2020.
index = []   # tick labels, "<year>_<month>"
counts = []  # number of long rides started in that month
is_long_ride = df_final.duration_sec >= 3600
for year in range(2018, 2021):
    for month in range(1, 13):
        # the data ends in July 2020
        if year == 2020 and month > 7:
            break
        in_month = (df_final.start_month == month) & (df_final.start_year == year)
        counts.append(len(df_final[in_month & is_long_ride].duration_sec))
        index.append(str(year) + "_" + str(month))
# The number of points follows from the loop instead of a hard-coded 31.
val = np.array(counts, dtype=float)
mon = len(index)
fig, ax = plt.subplots(figsize=(16, 8))
plt.plot(range(mon), val, 'k--o');
ax.set_xticks(range(mon))
ax.set_xticklabels(index, rotation = 45);
plt.xlabel("date (per month)", fontsize=15)
plt.ylabel("Rides", fontsize=15)
# The count has no unit and the filter is >= 3600, so the title says that.
plt.title("Number of rides per month for durations >= 3600 sec.", fontsize=20)
Well, I could not prove my hypothesis; the change in duration has something to do with 2020,
maybe more bikes being available on the streets, or it is more related to the pandemic. Now I will check whether the median of the duration in seconds for durations greater than 3600 (tourists) also shows some trend.
# Median duration of the rides lasting 3600 sec or more, per month from
# January 2018 to July 2020.
val = np.zeros(31)
index = []
mon = 0
for year in range(2018, 2021):
    for month in range(1, 13):
        if (year == 2020) & (month > 7):
            break  # no data after July 2020
        wanted = ((df_final.start_month == month)
                  & (df_final.start_year == year)
                  & (df_final.duration_sec >= 3600))
        long_ride_durations = df_final[wanted].duration_sec
        val[mon] = np.median(long_ride_durations)
        index.append(str(year) + "_" + str(month))
        mon += 1
fig, ax = plt.subplots(figsize=(16, 8))
positions = range(31)
plt.plot(positions, val, 'k--o');
ax.set_xticks(positions)
ax.set_xticklabels(index, rotation = 45);
plt.xlabel("date (per month)", fontsize=15)
plt.ylabel("duration (sec)", fontsize=15)
plt.title("Median of durations per month (sec) for durations > 3600 sec.", fontsize=20)
Interesting: although the number of rentals with durations greater than 1 hour has increased, the median of these durations is decreasing. This suggests that people are using the rides to commute and not for sightseeing and discovering the city. Look at this news:
According to them: "The number of visitors is expected to be less than half of last year."
# save the dataframe:
# Writes df_final to a pickle file in the current working directory.
df_final.to_pickle("data_final_for_vis.pkl")